package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Closeable;
import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.store.BufferedIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IOUtils;

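/**
 * Reads the term vectors for a segment from its three term vector
 * files: the index file (tvx), the document file (tvd) and the field
 * file (tvf). For each document, tvx holds a pointer into tvd (and,
 * from FORMAT_VERSION2 on, a second pointer into tvf); tvd lists the
 * vectorized fields of the document together with pointers into tvf,
 * where the terms, frequencies, positions and offsets live.
 *
 * <p>A minimal usage sketch (package-private API; assumes an existing
 * Directory, segment name and FieldInfos):
 *
 * <pre>
 * TermVectorsReader reader = new TermVectorsReader(dir, segmentName, fieldInfos);
 * try {
 *     TermFreqVector[] vectors = reader.get(0); // all vectors for doc 0
 * } finally {
 *     reader.close();
 * }
 * </pre>
 */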
class TermVectorsReader implements Cloneable, Closeable {

    // NOTE: if you make a new format, it must be larger than
    // the current format
    static final int FORMAT_VERSION = 2;

    // Changes to speed up bulk merging of term vectors:
    static final int FORMAT_VERSION2 = 3;

    // Changed strings to UTF8 with length-in-bytes not length-in-chars
    static final int FORMAT_UTF8_LENGTH_IN_BYTES = 4;

    // NOTE: always change this if you switch to a new format!
    static final int FORMAT_CURRENT = FORMAT_UTF8_LENGTH_IN_BYTES;

    // The size in bytes that the format version occupies at the beginning of each file
    static final int FORMAT_SIZE = 4;

    static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1;
    static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x2;

    private FieldInfos fieldInfos;

    private IndexInput tvx;
    private IndexInput tvd;
    private IndexInput tvf;
    private int size;
    private int numTotalDocs;
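    // 'size' is the number of docs visible through this reader (which
    // may be a slice of a shared doc store); 'numTotalDocs' is the
    // number of docs in the underlying files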

    // The docID offset where our docs begin in the index
    // file.  This will be 0 if we have our own private file.
    private int docStoreOffset;

    private final int format;

    TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos) throws CorruptIndexException, IOException {
        this(d, segment, fieldInfos, BufferedIndexInput.BUFFER_SIZE);
    }

    TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize) throws CorruptIndexException, IOException {
        this(d, segment, fieldInfos, readBufferSize, -1, 0);
    }

    TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize, int docStoreOffset, int size) throws CorruptIndexException, IOException {
        boolean success = false;

        try {
            String idxName = IndexFileNames.segmentFileName(segment, IndexFileNames.VECTORS_INDEX_EXTENSION);
            tvx = d.openInput(idxName, readBufferSize);
            format = checkValidFormat(idxName, tvx);
            String fn = IndexFileNames.segmentFileName(segment, IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
            tvd = d.openInput(fn, readBufferSize);
            final int tvdFormat = checkValidFormat(fn, tvd);
            fn = IndexFileNames.segmentFileName(segment, IndexFileNames.VECTORS_FIELDS_EXTENSION);
            tvf = d.openInput(fn, readBufferSize);
            final int tvfFormat = checkValidFormat(fn, tvf);

            assert format == tvdFormat;
            assert format == tvfFormat;

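            // Each tvx entry is a single long (the tvd pointer, 8
            // bytes/doc) before FORMAT_VERSION2, and two longs (tvd +
            // tvf pointers, 16 bytes/doc) from FORMAT_VERSION2 on; the
            // 4-byte format header is absorbed by the integer division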
            if (format >= FORMAT_VERSION2) {
                numTotalDocs = (int) (tvx.length() >> 4);
            } else {
                assert (tvx.length() - FORMAT_SIZE) % 8 == 0;
                numTotalDocs = (int) (tvx.length() >> 3);
            }

            if (-1 == docStoreOffset) {
                this.docStoreOffset = 0;
                this.size = numTotalDocs;
                assert size == 0 || numTotalDocs == size;
            } else {
                this.docStoreOffset = docStoreOffset;
                this.size = size;
                // Verify the file is long enough to hold all of our
                // docs
                assert numTotalDocs >= size + docStoreOffset : "numTotalDocs=" + numTotalDocs + " size=" + size + " docStoreOffset=" + docStoreOffset;
            }

            this.fieldInfos = fieldInfos;
            success = true;
        } finally {
            // With lock-less commits, it's entirely possible (and
            // fine) to hit a FileNotFound exception above. In
            // this case, we want to explicitly close any subset
            // of things that were opened so that we don't have to
            // wait for a GC to do so.
            if (!success) {
                close();
            }
        }
    }

    // Used for bulk copy when merging
    IndexInput getTvdStream() {
        return tvd;
    }

    // Used for bulk copy when merging
    IndexInput getTvfStream() {
        return tvf;
    }

    private void seekTvx(final int docNum) throws IOException {
        if (format < FORMAT_VERSION2)
            tvx.seek((docNum + docStoreOffset) * 8L + FORMAT_SIZE);
        else
            tvx.seek((docNum + docStoreOffset) * 16L + FORMAT_SIZE);
    }

    boolean canReadRawDocs() {
        return format >= FORMAT_UTF8_LENGTH_IN_BYTES;
    }

    /** Retrieve the length (in bytes) of the tvd and tvf
     *  entries for the next numDocs starting with
     *  startDocID.  This is used for bulk copying when
     *  merging segments, provided the field numbers are
     *  identical.  Once this returns, the tvf and tvd streams
     *  are positioned at the startDocID. */
    final void rawDocs(int[] tvdLengths, int[] tvfLengths, int startDocID, int numDocs) throws IOException {

        if (tvx == null) {
            Arrays.fill(tvdLengths, 0);
            Arrays.fill(tvfLengths, 0);
            return;
        }

        // SegmentMerger calls canReadRawDocs() first and should
        // not call us if that returns false.
        if (format < FORMAT_VERSION2)
            throw new IllegalStateException("cannot read raw docs with older term vector formats");

        seekTvx(startDocID);

        long tvdPosition = tvx.readLong();
        tvd.seek(tvdPosition);

        long tvfPosition = tvx.readLong();
        tvf.seek(tvfPosition);

        long lastTvdPosition = tvdPosition;
        long lastTvfPosition = tvfPosition;

        int count = 0;
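        // Each doc's tvd/tvf length is the delta between consecutive
        // tvx pointers; for the last document in the file, the file
        // lengths serve as the end boundary (the else branch below)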
        while (count < numDocs) {
            final int docID = docStoreOffset + startDocID + count + 1;
            assert docID <= numTotalDocs;
            if (docID < numTotalDocs) {
                tvdPosition = tvx.readLong();
                tvfPosition = tvx.readLong();
            } else {
                tvdPosition = tvd.length();
                tvfPosition = tvf.length();
                assert count == numDocs - 1;
            }
            tvdLengths[count] = (int) (tvdPosition - lastTvdPosition);
            tvfLengths[count] = (int) (tvfPosition - lastTvfPosition);
            count++;
            lastTvdPosition = tvdPosition;
            lastTvfPosition = tvfPosition;
        }
    }

    private int checkValidFormat(String fn, IndexInput in) throws CorruptIndexException, IOException {
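        // The version int occupies the first FORMAT_SIZE bytes of each
        // file; versions newer than FORMAT_CURRENT cannot be read here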
        int format = in.readInt();
        if (format > FORMAT_CURRENT) {
            throw new IndexFormatTooNewException(in, format, 1, FORMAT_CURRENT);
        }
        return format;
    }

    public void close() throws IOException {
        IOUtils.close(tvx, tvd, tvf);
    }

    /**
     * @return The number of documents in the reader
     */
    int size() {
        return size;
    }

    public void get(int docNum, String field, TermVectorMapper mapper) throws IOException {
        if (tvx != null) {
            int fieldNumber = fieldInfos.fieldNumber(field);
            // We need to account for the FORMAT_SIZE header when seeking
            // in the tvx. Other seeks don't need this because they use
            // file pointers that were written into another file and
            // already include the offset.
            seekTvx(docNum);
            long tvdPosition = tvx.readLong();

            tvd.seek(tvdPosition);
            int fieldCount = tvd.readVInt();
            // There are only a few fields per document. We opt for a full
            // scan rather than requiring that they be ordered. We need to
            // read through all of the fields anyway to get to the tvf
            // pointers.
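            // Field numbers are absolute VInts from FORMAT_VERSION on;
            // older formats delta-encode them against the previous field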
            int number = 0;
            int found = -1;
            for (int i = 0; i < fieldCount; i++) {
                if (format >= FORMAT_VERSION)
                    number = tvd.readVInt();
                else
                    number += tvd.readVInt();

                if (number == fieldNumber)
                    found = i;
            }

            if (found != -1) {
                // Compute position in the tvf file
                long position;
                if (format >= FORMAT_VERSION2)
                    position = tvx.readLong();
                else
                    position = tvd.readVLong();
                for (int i = 1; i <= found; i++)
                    position += tvd.readVLong();

                mapper.setDocumentNumber(docNum);
                readTermVector(field, position, mapper);
            } else {
                // This field, although valid in the segment, was not
                // found in this document
            }
        } else {
            // This segment does not store term vectors (no tvx file)
        }
    }

    /**
     * Retrieve the term vector for the given document and field
     * @param docNum The document number to retrieve the vector for
     * @param field The field within the document to retrieve
     * @return The TermFreqVector for the document and field, or null if there is no term vector for this field.
     * @throws IOException if there is an error reading the term vector files
     */
    TermFreqVector get(int docNum, String field) throws IOException {
        // The delegated get() is a no-op when this segment has no term
        // vectors; materializeVector() then returns null
        ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
        get(docNum, field, mapper);

        return mapper.materializeVector();
    }

    // Reads the String[] fields; you have to pre-seek tvd to
    // the right point
    private String[] readFields(int fieldCount) throws IOException {
        int number = 0;
        String[] fields = new String[fieldCount];

        for (int i = 0; i < fieldCount; i++) {
            if (format >= FORMAT_VERSION)
                number = tvd.readVInt();
            else
                number += tvd.readVInt();

            fields[i] = fieldInfos.fieldName(number);
        }

        return fields;
    }

    // Reads the long[] offsets into TVF; you have to pre-seek
    // tvx/tvd to the right point
    private long[] readTvfPointers(int fieldCount) throws IOException {
        // Compute position in the tvf file
        long position;
        if (format >= FORMAT_VERSION2)
            position = tvx.readLong();
        else
            position = tvd.readVLong();

        long[] tvfPointers = new long[fieldCount];
        tvfPointers[0] = position;
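        // Only the first pointer is absolute; the rest are stored as
        // VLong deltas from the previous field's pointer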

        for (int i = 1; i < fieldCount; i++) {
            position += tvd.readVLong();
            tvfPointers[i] = position;
        }

        return tvfPointers;
    }

    /**
     * Return all term vectors stored for this document, or null if they could not be read in.
     *
     * @param docNum The document number to retrieve the vector for
     * @return All term frequency vectors
     * @throws IOException if there is an error reading the term vector files 
     */
    TermFreqVector[] get(int docNum) throws IOException {
        TermFreqVector[] result = null;
        if (tvx != null) {
            // seekTvx accounts for the docStoreOffset and the format header
            seekTvx(docNum);
            long tvdPosition = tvx.readLong();

            tvd.seek(tvdPosition);
            int fieldCount = tvd.readVInt();

            // fieldCount is 0 if no fields are vectorized for this document
            if (fieldCount != 0) {
                final String[] fields = readFields(fieldCount);
                final long[] tvfPointers = readTvfPointers(fieldCount);
                result = readTermVectors(docNum, fields, tvfPointers);
            }
        } else {
            // This segment does not store term vectors (no tvx file)
        }
        return result;
    }

    public void get(int docNumber, TermVectorMapper mapper) throws IOException {
        // Check if no term vectors are available for this segment at all
        if (tvx != null) {
            // seekTvx accounts for the docStoreOffset and the format header
            seekTvx(docNumber);
            long tvdPosition = tvx.readLong();

            tvd.seek(tvdPosition);
            int fieldCount = tvd.readVInt();

            // fieldCount is 0 if no fields are vectorized for this document
            if (fieldCount != 0) {
                final String[] fields = readFields(fieldCount);
                final long[] tvfPointers = readTvfPointers(fieldCount);
                mapper.setDocumentNumber(docNumber);
                readTermVectors(fields, tvfPointers, mapper);
            }
        } else {
            // This segment does not store term vectors (no tvx file)
        }
    }

    private SegmentTermVector[] readTermVectors(int docNum, String fields[], long tvfPointers[]) throws IOException {
        SegmentTermVector res[] = new SegmentTermVector[fields.length];
        for (int i = 0; i < fields.length; i++) {
            ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
            mapper.setDocumentNumber(docNum);
            readTermVector(fields[i], tvfPointers[i], mapper);
            res[i] = (SegmentTermVector) mapper.materializeVector();
        }
        return res;
    }

    private void readTermVectors(String fields[], long tvfPointers[], TermVectorMapper mapper) throws IOException {
        for (int i = 0; i < fields.length; i++) {
            readTermVector(fields[i], tvfPointers[i], mapper);
        }
    }

    /**
     * Read a single term vector from the tvf file into the given mapper.
     *
     * @param field The field to read in
     * @param tvfPointer The pointer within the tvf file where we should start reading
     * @param mapper The mapper used to map the TermVector
     * @throws IOException if there is an error reading the term vector files
     */
    private void readTermVector(String field, long tvfPointer, TermVectorMapper mapper) throws IOException {

        // Now read the data from the specified position. We don't need
        // to offset by FORMAT_SIZE here since the stored pointer already
        // includes that offset.
        tvf.seek(tvfPointer);

        int numTerms = tvf.readVInt();
        // If there are no terms we return without invoking the mapper;
        // in practice this should never occur
        if (numTerms == 0)
            return;

        boolean storePositions;
        boolean storeOffsets;

        if (format >= FORMAT_VERSION) {
            byte bits = tvf.readByte();
            storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
            storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
        } else {
            tvf.readVInt();
            storePositions = false;
            storeOffsets = false;
        }
        mapper.setExpectations(field, numTerms, storeOffsets, storePositions);
        int start = 0;
        int deltaLength = 0;
        int totalLength = 0;
        byte[] byteBuffer;
        char[] charBuffer;
        final boolean preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES;

        // init the buffers
        if (preUTF8) {
            charBuffer = new char[10];
            byteBuffer = null;
        } else {
            charBuffer = null;
            byteBuffer = new byte[20];
        }

        for (int i = 0; i < numTerms; i++) {
            start = tvf.readVInt();
            deltaLength = tvf.readVInt();
            totalLength = start + deltaLength;
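            // Terms are prefix-coded: 'start' is the length of the
            // prefix shared with the previous term and 'deltaLength'
            // the length of the new suffix, so totalLength is the full
            // term length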

            final String term;

            if (preUTF8) {
                // Term stored as java chars
                if (charBuffer.length < totalLength) {
                    charBuffer = ArrayUtil.grow(charBuffer, totalLength);
                }
                tvf.readChars(charBuffer, start, deltaLength);
                term = new String(charBuffer, 0, totalLength);
            } else {
                // Term stored as utf8 bytes
                if (byteBuffer.length < totalLength) {
                    byteBuffer = ArrayUtil.grow(byteBuffer, totalLength);
                }
                tvf.readBytes(byteBuffer, start, deltaLength);
                term = new String(byteBuffer, 0, totalLength, "UTF-8");
            }
            int freq = tvf.readVInt();
            int[] positions = null;
            if (storePositions) { //read in the positions
                //does the mapper even care about positions?
                if (mapper.isIgnoringPositions() == false) {
                    positions = new int[freq];
                    int prevPosition = 0;
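                    // Positions are stored as VInt deltas from the
                    // previous position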
                    for (int j = 0; j < freq; j++) {
                        positions[j] = prevPosition + tvf.readVInt();
                        prevPosition = positions[j];
                    }
                } else {
                    // We need to skip over the positions. Since they are
                    // VInts, there is no way to know how far to skip
                    // without reading them one by one.
                    for (int j = 0; j < freq; j++) {
                        tvf.readVInt();
                    }
                }
            }
            TermVectorOffsetInfo[] offsets = null;
            if (storeOffsets) {
                //does the mapper even care about offsets?
                if (mapper.isIgnoringOffsets() == false) {
                    offsets = new TermVectorOffsetInfo[freq];
                    int prevOffset = 0;
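                    // Each startOffset is a VInt delta from the previous
                    // endOffset, and each endOffset a VInt delta from its
                    // startOffset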
                    for (int j = 0; j < freq; j++) {
                        int startOffset = prevOffset + tvf.readVInt();
                        int endOffset = startOffset + tvf.readVInt();
                        offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset);
                        prevOffset = endOffset;
                    }
                } else {
                    for (int j = 0; j < freq; j++) {
                        tvf.readVInt();
                        tvf.readVInt();
                    }
                }
            }
            mapper.map(term, freq, offsets, positions);
        }
    }

    @Override
    protected Object clone() throws CloneNotSupportedException {

        final TermVectorsReader clone = (TermVectorsReader) super.clone();

        // These are null when a TermVectorsReader was created
        // on a segment that did not have term vectors saved
        if (tvx != null && tvd != null && tvf != null) {
            clone.tvx = (IndexInput) tvx.clone();
            clone.tvd = (IndexInput) tvd.clone();
            clone.tvf = (IndexInput) tvf.clone();
        }

        return clone;
    }
}

/**
 * Models the existing parallel array structure
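 * used to build a {@link SegmentTermVector}: the reader calls
 * {@link #setExpectations} once per field, then {@link #map} once per
 * term, and finally {@link #materializeVector()} to build the result.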
 */
class ParallelArrayTermVectorMapper extends TermVectorMapper {

    private String[] terms;
    private int[] termFreqs;
    private int positions[][];
    private TermVectorOffsetInfo offsets[][];
    private int currentPosition;
    private boolean storingOffsets;
    private boolean storingPositions;
    private String field;

    @Override
    public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
        this.field = field;
        terms = new String[numTerms];
        termFreqs = new int[numTerms];
        this.storingOffsets = storeOffsets;
        this.storingPositions = storePositions;
        if (storePositions)
            this.positions = new int[numTerms][];
        if (storeOffsets)
            this.offsets = new TermVectorOffsetInfo[numTerms][];
    }

    @Override
    public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
        terms[currentPosition] = term;
        termFreqs[currentPosition] = frequency;
        if (storingOffsets) {
            this.offsets[currentPosition] = offsets;
        }
        if (storingPositions) {
            this.positions[currentPosition] = positions;
        }
        currentPosition++;
    }

    /**
     * Construct the vector
     * @return The {@link TermFreqVector} based on the mappings.
     */
    public TermFreqVector materializeVector() {
        SegmentTermVector tv = null;
        if (field != null && terms != null) {
            if (storingPositions || storingOffsets) {
                tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
            } else {
                tv = new SegmentTermVector(field, terms, termFreqs);
            }
        }
        return tv;
    }
}
